
pacman::p_load(tidyverse,
               fs,
               glue,
               fst,
               tidymodels,
               doParallel,
               embed,
               tictoc,
               lubridate,
               glmnet,
               glmnetUtils,
               readxl
)

# "Larry" variables: ~450 CCP variables that were manually checked for
# credit-scoring appropriateness. Their names are read from the `var` column
# of the spreadsheet, with duplicates dropped.
v_larry_cc_cd <- read_xlsx("../../data/larry_cc_cd_vars.xlsx") %>% 
  pull(var) %>% 
  unique()

# Directory listed later for the saved smbinning results (.rds files).
# SPECIAL_SUFFIX, TRAINING_SUFFIX and RAND_NO_CID_SMALLEST_LARGEST are not
# defined in this script and must exist before it runs.
path_smbinning <- paste0("../../data/pipeline_outputs/", 
                         SPECIAL_SUFFIX, "/", "smbinning_", TRAINING_SUFFIX,
                         "_", RAND_NO_CID_SMALLEST_LARGEST, "/")

# Read only the id/outcome columns plus the Larry variables, then keep the
# requested consumer categories and the first five rand_no_cid buckets.
# `columns` is an ordinary character vector, so the names are passed directly:
# all_of() is a tidyselect helper meant for use inside selecting verbs only
# (using it elsewhere is deprecated).
df_training_raw <- read_fst(paste0("../../data/pipeline_outputs/", 
                                   SPECIAL_SUFFIX, "/", "train_",
                                   TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "/",
                                   "training.fst"),
                            columns = c("cid", "qtr", "t_default", "riskscore", "rand_no_cid", 
                                        v_larry_cc_cd, "consumer_category")) %>% 
  filter(consumer_category %in% CONSUMER_GROUP,
         rand_no_cid %in% c("0000", "0001", "0002", "0003", "0004"))



# Add the outcome helper columns to the raw training data:
#   year         - calendar year extracted from `qtr`
#   year_outcome - 10 * year + t_default, computed before t_default is
#                  converted to a factor
#   t_default    - recoded as a factor with levels 0 and 1
df_training_outcome <- mutate(
  df_training_raw,
  year = year(qtr),
  year_outcome = (10 * year) + t_default,
  t_default = factor(t_default, levels = c(0, 1))
)

# Number of training rows; used as the denominator for value shares.
num_obs_ <- dim(df_training_outcome)[1]

#' Names of variables that are not dominated by a single value
#'
#' Looks at the columns of `data` that have the requested type and whose name
#' contains "VARNAMEPART", finds the most frequent value of each column
#' (NA is counted as a value) and keeps the column when that value covers
#' less than `removal_threshold` of the rows.
#'
#' @param removal_threshold Share of rows (between 0 and 1). A column whose
#'   most frequent value reaches this share is dropped.
#' @param numeric_or_character Either "numeric" or "character": the column
#'   type to scan.
#' @param data Data frame to scan. Defaults to the script-level
#'   `df_training_outcome`, which is what existing two-argument calls use.
#' @return Character vector of retained column names, each name at most once.
Remove_Single_Valued_Variable <- function(removal_threshold,
                                          numeric_or_character,
                                          data = df_training_outcome) {

  # Fail loudly on an unknown type instead of letting switch() return NULL.
  numeric_or_character <- match.arg(numeric_or_character,
                                    c("numeric", "character"))

  variable_type <- switch(numeric_or_character,
                          "numeric" = is.numeric,
                          "character" = is.character)

  n_rows <- nrow(data)

  data %>%
    select(where(variable_type)) %>%
    select(contains("VARNAMEPART")) %>%
    pivot_longer(everything()) %>%
    group_by(name, value) %>%
    # Keep the grouping by `name` so the next step works per variable.
    summarize(count = n(), .groups = "drop_last") %>%
    # Exactly one row per variable: with ties kept, a variable whose top
    # values are equally frequent would be returned more than once.
    slice_max(order_by = count, n = 1, with_ties = FALSE) %>%
    ungroup() %>%
    mutate(pct_of_obs = count / n_rows) %>%
    filter(pct_of_obs < removal_threshold) %>%
    pull(name)

}

# Variables whose most frequent value covers less than 99% of the rows.
v_numeric <- Remove_Single_Valued_Variable(removal_threshold = .99,
                                           numeric_or_character = "numeric")
v_categorical <- Remove_Single_Valued_Variable(removal_threshold = .99,
                                               numeric_or_character = "character")

df_numeric <- df_training_outcome %>% 
  select(all_of(v_numeric))

# Numeric variables with fewer than 5 distinct values are treated as
# categorical. The distinct count is taken column by column, which avoids
# building a long table with one row per cell just to count values.
v_var_less_than_5_unique <-
  names(df_numeric)[map_lgl(df_numeric, ~ n_distinct(.) < 5)]

v_var_5_or_more_unique <- setdiff(names(df_numeric), v_var_less_than_5_unique)

v_concat_dummies <- c(v_categorical, v_var_less_than_5_unique)
v_union_dummies <- union(v_var_less_than_5_unique, v_categorical)

# Test the overlap directly: comparing the lengths of the concatenation and
# the union also fails when a name is merely repeated inside one vector.
stopifnot("There should not be overlap between categorical and numeric vars." =  
            length(intersect(v_categorical, v_var_less_than_5_unique)) == 0)

# Final split: variables that are dummy-coded as they are, and variables
# that are binned first and then dummy-coded.
v_vars_to_dummy <- v_union_dummies
v_vars_to_discretize_and_dummy <- v_var_5_or_more_unique

# Keep the id columns, the outcome and the two groups of candidate variables.
df_selected_vars <- select(
  df_training_outcome,
  cid, qtr, rand_no_cid, year_outcome, t_default,
  all_of(c(v_vars_to_dummy, v_vars_to_discretize_and_dummy))
)

# Missing-value handling:
#   - variables to dummy: converted to factors, with NA turned into an
#     explicit "missing" level
#   - variables to discretize: NA replaced by the sentinel -999999
df_training <- mutate(
  df_selected_vars,
  across(
    .cols = all_of(v_vars_to_dummy),
    ~ fct_explicit_na(as.factor(.), na_level = "missing")
  ),
  across(
    .cols = all_of(v_vars_to_discretize_and_dummy),
    ~ ifelse(is.na(.), -999999, .)
  )
)






# Replace each variable in v_vars_to_discretize_and_dummy with a binned
# version, using the cut points stored in its saved smbinning result. When
# smbinning found no significant split, fall back to quantile bins (or to a
# plain character column when the variable has fewer than 5 distinct values).

# The directory listing does not change inside the loop, so read it once.
smb_files_ <- as.character(dir_ls(path_smbinning))

# seq_along() makes the loop a no-op when there is nothing to discretize;
# 1:length(x) would iterate over c(1, 0) instead.
for (i in seq_along(v_vars_to_discretize_and_dummy)) {
  
  var_ <- v_vars_to_discretize_and_dummy[i]
  rds_name_ <- paste0(var_, ".rds")
  
  # Match the file name literally at the end of the path. An unanchored
  # regex would treat "." as a wildcard and could pick up several files.
  file_name_smb <- smb_files_[str_ends(smb_files_, fixed(rds_name_))]
  
  # If another variable's file name ends with this one, prefer the file
  # named exactly "<var>.rds".
  if (length(file_name_smb) > 1) {
    exact_match_ <- file_name_smb[path_file(file_name_smb) == rds_name_]
    if (length(exact_match_) == 1) {
      file_name_smb <- exact_match_
    }
  }
  
  if (length(file_name_smb) != 1) {
    stop(
      "Expected exactly one smbinning file for '", var_, "' in ",
      path_smbinning, ", found ", length(file_name_smb), ".",
      call. = FALSE
    )
  }
  
  smbinning_result <- read_rds(file_name_smb)
  
  # [[ ]] always returns the column as a vector, for data.frames and tibbles.
  col_ <- df_training[[var_]]
  
  # Only a character result can be the "no splits" message; checking the
  # type first avoids comparing a list result element-wise against a string.
  no_splits_ <- is.character(smbinning_result) &&
    all(smbinning_result == "No significant splits")
  
  if (no_splits_) {
    
    if (n_distinct(col_) < 5) {
      df_training[[var_]] <- as.character(col_)
    } else {
      # Quantile break points, padded with -Inf/Inf; unique() removes
      # repeated breaks, which cut() does not accept.
      cuts_ <- unique(c(-Inf, quantile(col_), Inf))
      df_training[[var_]] <- cut(col_, cuts_, right = TRUE, include.lowest = TRUE)
    }
  } else {
    
    cuts_ <- c(-Inf, smbinning_result$cuts, Inf)
    df_training[[var_]] <- cut(col_, cuts_, right = TRUE, include.lowest = TRUE)
    
  }
  
  # Progress indicator.
  print(glue("{i}/{length(v_vars_to_discretize_and_dummy)}"))
  
}


# Formula for the binning recipe: t_default against every other column.
discretization_formula <- as.formula("t_default ~ . ")

# One-sided formula for the dummy recipe: the id columns plus the variables
# that are dummy-coded directly.
dummies_formula <- reformulate(
  termlabels = names(
    select(df_training, cid, qtr, rand_no_cid, year_outcome, all_of(v_vars_to_dummy))
  )
)

set.seed(1234)

# Recipe 1: dummy-code the binned variables and add an intercept column.
# The recipe is built one step at a time and prepped on df_training.
tic(msg = "Discretizing and Creating Dummy Variables")

recipe_binning <- recipe(formula = discretization_formula, data = df_training)
recipe_binning <- update_role(recipe_binning,
                              cid, qtr, rand_no_cid, year_outcome,
                              new_role = "id")
# Disabled alternative that would bin with xgboost inside the recipe:
#step_discretize_xgb(all_of(v_vars_to_discretize_and_dummy), outcome = "t_default", id = "xgb_bins") %>%
recipe_binning <- step_novel(recipe_binning,
                             all_of(v_vars_to_discretize_and_dummy),
                             - all_outcomes())
recipe_binning <- step_dummy(recipe_binning,
                             all_of(v_vars_to_discretize_and_dummy),
                             one_hot = FALSE)
recipe_binning <- step_intercept(recipe_binning)
recipe_binning <- prep(recipe_binning, strings_as_factors = FALSE)

toc(log = TRUE)

# Recipe 2: dummy-code the variables that need no binning.
tic(msg = "Creating Dummies")

recipe_dummies <- recipe(formula = dummies_formula, data = df_training)
recipe_dummies <- update_role(recipe_dummies,
                              cid, qtr, rand_no_cid, year_outcome,
                              new_role = "id")
recipe_dummies <- step_novel(recipe_dummies,
                             all_of(v_vars_to_dummy),
                             - all_outcomes())
recipe_dummies <- step_dummy(recipe_dummies,
                             all_of(v_vars_to_dummy),
                             one_hot = FALSE)
recipe_dummies <- prep(recipe_dummies, strings_as_factors = FALSE)

toc(log = TRUE)

# Apply both prepped recipes to the training data.
df_training_binned <- bake(recipe_binning, new_data = df_training)
df_training_dummied <- bake(recipe_dummies, new_data = df_training)



# Combine the two baked tables on the id columns.
# NOTE(review): year_outcome has the "id" role in both recipes but is not a
# join key, so it presumably comes out as year_outcome.x / year_outcome.y.
# NOTE(review): recipe_binning only dummy-codes the discretized variables, so
# df_training_binned appears to still carry the undummied v_vars_to_dummy
# factor columns. The formula below would then contain both such a factor and
# its dummy columns from df_training_dummied - confirm this is intended.
df_training_baked <- df_training_binned %>% 
  inner_join(df_training_dummied, by = c("cid", "qtr", "rand_no_cid"))
names(df_training_baked)

# Model formula: t_default against every column whose name contains
# "VARNAMEPART".
f_ <- reformulate(
  termlabels = names(select(df_training_baked, contains("VARNAMEPART"))),
  response = "t_default"
)

# alpha = 1 selects the LASSO penalty; the binomial family gives a logistic
# model. The fit covers a whole path of lambda values.
tic(msg = "Fitting Logistic LASSO")
fit_logistic <- glmnet(f_, data = df_training_baked, alpha = 1, family = "binomial")
toc(log = TRUE)

# Coefficient matrix of the fitted path (terms in rows, lambdas in columns).
mx_coefs <- coef(fit_logistic)

# Long table with one row per (lambda, term) returned by tidy(). Only terms
# that contain "VARNAMEPART" and more than one underscore are kept, and the
# intercept is dropped. Root_Variable is the term name with its last
# "_<suffix>" part removed, i.e. everything before the final underscore.
df_coefs_ungrouped <- tidy(mx_coefs) %>% 
  select(variable = row, lambda_index = column) %>% 
  filter(
    variable != "(Intercept)",
    str_count(variable, "_") > 1,
    str_detect(variable, "VARNAMEPART")
  ) %>% 
  transmute(
    lambda_index,
    Root_Variable = str_remove(variable, "_[^_]*$"),
    variable
  )

# Output directory for the LASSO variable-selection files.
path_save <- paste0("../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/", "lasso_selected_vars_",
                    TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "/")

# write_csv() does not create missing directories; dir_create() is a no-op
# when the directory already exists.
dir_create(path_save)

write_csv(df_coefs_ungrouped, paste0(path_save, "df_lasso_vars_all_lambdas.csv"))

# Number of distinct root variables present at each lambda.
df_coefs_grouped <- df_coefs_ungrouped %>% 
  group_by(lambda_index) %>% 
  summarize(
    unique_root_vars = length(unique(Root_Variable))
  ) 

# Pick the lambda with the fewest root variables among those that have at
# least 100 of them.
lambda_index_100_root_vars_ <- df_coefs_grouped %>% 
  filter(unique_root_vars >= 100) %>% 
  arrange(unique_root_vars) %>% 
  slice(1) %>% 
  pull(lambda_index)

# Without this guard an empty result surfaces later as an opaque size error
# inside filter().
if (length(lambda_index_100_root_vars_) != 1) {
  stop(
    "No lambda on the LASSO path reaches 100 distinct root variables.",
    call. = FALSE
  )
}

v_lasso_selected_vars <- df_coefs_ungrouped %>% 
  filter(lambda_index == lambda_index_100_root_vars_) %>% 
  pull(Root_Variable) %>% 
  unique()

# Attach the data-dictionary entries for the selected variables.
# DF_DATA_DICTIONARY is not defined in this script and must exist beforehand.
df_lasso_selected_vars <- tibble(Variable = v_lasso_selected_vars) %>% 
  left_join(., DF_DATA_DICTIONARY, by = c("Variable" = "Variable Name"))


write_csv(df_lasso_selected_vars, paste0(path_save, "df_lasso_vars_and_dict.csv"))

# Turn the tictoc log entries into a two-column table: the text before the
# last colon is the process name, the text after ": " is the timing.
df_timing_log <- tic.log() %>% 
  unlist() %>% 
  {
    tibble(
      Process = str_extract(., pattern = ".+(?=:)"),
      Time = str_extract(., pattern = "(?<=: ).+")
    )
  }

# Reset the tictoc timers and log.
tic.clear()
tic.clearlog()

write_csv(df_timing_log, paste0(path_save, "df_lasso_timing_log.csv"))



# Last expression of the script: the fitted LASSO path.
fit_logistic

